[Autogluon] df50

Author

김보람

Published

September 20, 2023

imports

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import networkx as nx
import sklearn
import xgboost as xgb

# sklearn
from sklearn import model_selection # for train_test_split
from sklearn import ensemble # RF,GBM
from sklearn import metrics
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

# gnn
import torch
import torch.nn.functional as F
import torch_geometric
from torch_geometric.nn import GCNConv

# autogluon
from autogluon.tabular import TabularDataset, TabularPredictor
def down_sample_textbook(df):
    # downsample the majority (non-fraud) class to the size of the
    # minority (fraud) class, giving a 50:50 balanced frame
    df_majority = df[df.is_fraud==0].copy()
    df_minority = df[df.is_fraud==1].copy()
    df_maj_downsampled = sklearn.utils.resample(df_majority, n_samples=len(df_minority), replace=False, random_state=42)
    df_downsampled = pd.concat([df_minority, df_maj_downsampled])
    return df_downsampled

def compute_time_difference(group):
    # for every ordered pair of rows in the group, record
    # [index_i, index_j, |t_i - t_j|] with timestamps in nanoseconds
    n = len(group)
    result = []
    for i in range(n):
        for j in range(n):
            time_difference = abs(group.iloc[i].trans_date_trans_time.value - group.iloc[j].trans_date_trans_time.value)
            result.append([group.iloc[i].name, group.iloc[j].name, time_difference])
    return result
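For reference, the nested loop above is O(n²) per group and relies on slow row-by-row .iloc access; the same [index_i, index_j, |Δt|] table can be built with numpy broadcasting. A minimal sketch (the function name is mine; the output matches the loop version up to dtype and row order):

def compute_time_difference_vec(group):
    # nanosecond timestamps and original row labels of the group
    t = group.trans_date_trans_time.astype('int64').to_numpy()
    idx = group.index.to_numpy()
    diff = np.abs(t[:, None] - t[None, :])           # all pairwise |t_i - t_j|
    i, j = np.meshgrid(idx, idx, indexing='ij')      # matching index pairs
    return np.column_stack([i.ravel(), j.ravel(), diff.ravel()])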


class GCN(torch.nn.Module):
    # two-layer GCN: 1 input feature (amt) -> 16 hidden units -> 2 classes
    def __init__(self):
        super().__init__()
        self.conv1 = GCNConv(1, 16)
        self.conv2 = GCNConv(16, 2)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index

        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)

        return F.log_softmax(x, dim=1)
fraudTrain = pd.read_csv("~/Desktop/fraudTrain.csv").iloc[:,1:]
fraudTrain = fraudTrain.assign(trans_date_trans_time=pd.to_datetime(fraudTrain.trans_date_trans_time))
fraudTrain
trans_date_trans_time cc_num merchant category amt first last gender street city ... lat long city_pop job dob trans_num unix_time merch_lat merch_long is_fraud
0 2019-01-01 00:00:00 2.703190e+15 fraud_Rippin, Kub and Mann misc_net 4.97 Jennifer Banks F 561 Perry Cove Moravian Falls ... 36.0788 -81.1781 3495 Psychologist, counselling 1988-03-09 0b242abb623afc578575680df30655b9 1325376018 36.011293 -82.048315 0
1 2019-01-01 00:00:00 6.304230e+11 fraud_Heller, Gutmann and Zieme grocery_pos 107.23 Stephanie Gill F 43039 Riley Greens Suite 393 Orient ... 48.8878 -118.2105 149 Special educational needs teacher 1978-06-21 1f76529f8574734946361c461b024d99 1325376044 49.159047 -118.186462 0
2 2019-01-01 00:00:00 3.885950e+13 fraud_Lind-Buckridge entertainment 220.11 Edward Sanchez M 594 White Dale Suite 530 Malad City ... 42.1808 -112.2620 4154 Nature conservation officer 1962-01-19 a1a22d70485983eac12b5b88dad1cf95 1325376051 43.150704 -112.154481 0
3 2019-01-01 00:01:00 3.534090e+15 fraud_Kutch, Hermiston and Farrell gas_transport 45.00 Jeremy White M 9443 Cynthia Court Apt. 038 Boulder ... 46.2306 -112.1138 1939 Patent attorney 1967-01-12 6b849c168bdad6f867558c3793159a81 1325376076 47.034331 -112.561071 0
4 2019-01-01 00:03:00 3.755340e+14 fraud_Keeling-Crist misc_pos 41.96 Tyler Garcia M 408 Bradley Rest Doe Hill ... 38.4207 -79.4629 99 Dance movement psychotherapist 1986-03-28 a41d7549acf90789359a9aa5346dcb46 1325376186 38.674999 -78.632459 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1048570 2020-03-10 16:07:00 6.011980e+15 fraud_Fadel Inc health_fitness 77.00 Haley Wagner F 05561 Farrell Crescent Annapolis ... 39.0305 -76.5515 92106 Accountant, chartered certified 1943-05-28 45ecd198c65e81e597db22e8d2ef7361 1362931649 38.779464 -76.317042 0
1048571 2020-03-10 16:07:00 4.839040e+15 fraud_Cremin, Hamill and Reichel misc_pos 116.94 Meredith Campbell F 043 Hanson Turnpike Hedrick ... 41.1826 -92.3097 1583 Geochemist 1999-06-28 c00ce51c6ebb7657474a77b9e0b51f34 1362931670 41.400318 -92.726724 0
1048572 2020-03-10 16:08:00 5.718440e+11 fraud_O'Connell, Botsford and Hand home 21.27 Susan Mills F 005 Cody Estates Louisville ... 38.2507 -85.7476 736284 Engineering geologist 1952-04-02 17c9dc8b2a6449ca2473726346e58e6c 1362931711 37.293339 -84.798122 0
1048573 2020-03-10 16:08:00 4.646850e+18 fraud_Thompson-Gleason health_fitness 9.52 Julia Bell F 576 House Crossroad West Sayville ... 40.7320 -73.1000 4056 Film/video editor 1990-06-25 5ca650881b48a6a38754f841c23b77ab 1362931718 39.773077 -72.213209 0
1048574 2020-03-10 16:08:00 2.283740e+15 fraud_Buckridge PLC misc_pos 6.81 Shannon Williams F 9345 Spencer Junctions Suite 183 Alpharetta ... 34.0770 -84.3033 165556 Prison officer 1997-12-27 8d0a575fe635bbde12f1a2bffc126731 1362931730 33.601468 -83.891921 0

1048575 rows × 22 columns

Data preparation

_df1 = fraudTrain[fraudTrain["is_fraud"] == 0].sample(frac=0.20, random_state=42)
_df2 = fraudTrain[fraudTrain["is_fraud"] == 1]
df02 = pd.concat([_df1,_df2])
df02.shape
(214520, 22)
df50 = down_sample_textbook(df02)
df50.shape
(12012, 22)
df50 = df50.reset_index()
N = len(df50)
df50 = df50[["amt","is_fraud"]]
df50["amt"].mean()
297.4638911088911
df50["amt"].describe()
count    12012.000000
mean       297.463891
std        384.130842
min          1.010000
25%         19.917500
50%         84.680000
75%        468.295000
max      12025.300000
Name: amt, dtype: float64

tr/test

df50_tr,df50_test = sklearn.model_selection.train_test_split(df50, random_state=42)
df50_tr.shape, df50_test.shape
((9009, 2), (3003, 2))
train_mask = [i in df50_tr.index for i in range(N)]
test_mask = [i in df50_test.index for i in range(N)]
train_mask = np.array(train_mask)
test_mask = np.array(test_mask)
train_mask.sum(), test_mask.sum()
(9009, 3003)
train_mask.shape, test_mask.shape
((12012,), (12012,))
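(Aside: since df50 was reset_index-ed, its positional and label indices coincide, so the masks can also be filled directly instead of scanning range(N) for each row. An equivalent sketch:)

train_mask = np.zeros(N, dtype=bool)
test_mask = np.zeros(N, dtype=bool)
train_mask[df50_tr.index] = True   # positions of the training rows
test_mask[df50_test.index] = True  # positions of the test rows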

edge_index setup

Edges connect transactions that share a cc_num: each pair (i, j) is weighted by exp(-|Δt|/θ), where Δt is the time difference between the two transactions and θ is the mean pairwise time difference, and only pairs with above-average weight are kept as edges.

# the pairwise table below was precomputed once (on df50 before the
# column selection above, while cc_num was still available) and saved:
# groups = df50.groupby('cc_num')
# edge_index_list_plus = [compute_time_difference(group) for _, group in groups]
# edge_index_list_plus_flat = [item for sublist in edge_index_list_plus for item in sublist]
# edge_index_list_plus_nparr = np.array(edge_index_list_plus_flat)
# np.save('edge_index_list_plus50.npy', edge_index_list_plus_nparr)
edge_index = np.load('edge_index_list_plus50.npy').astype(np.float64)
theta = edge_index[:,2].mean()  # mean pairwise time difference
# weight each pair by exp(-|Δt|/θ); the `!= 1` factor zeroes out
# self-pairs, whose Δt = 0 gives a weight of exactly 1
edge_index[:,2] = (np.exp(-edge_index[:,2]/theta) != 1)*(np.exp(-edge_index[:,2]/theta))
edge_index = edge_index.tolist()
mean_ = np.array(edge_index)[:,2].mean()
# keep only the pairs whose weight exceeds the mean weight
selected_edges = [(int(row[0]), int(row[1])) for row in edge_index if row[2] > mean_]
edge_index_selected = torch.tensor(selected_edges, dtype=torch.long).t()

Data setup (x, edge_index, y)

x = torch.tensor(df50['amt'].values, dtype=torch.float).reshape(-1,1)
y = torch.tensor(df50['is_fraud'].values, dtype=torch.int64)
data = torch_geometric.data.Data(x=x, edge_index = edge_index_selected, y=y, train_mask = train_mask, test_mask = test_mask)
data
Data(x=[12012, 1], edge_index=[2, 93730], y=[12012], train_mask=[12012], test_mask=[12012])
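The GCN class defined in the imports section is not actually trained in this post, but for reference, a standard PyTorch Geometric semi-supervised loop over this data object would look roughly like the following sketch (the epoch count and learning rate are illustrative, not from the original):

model = GCN()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
train_mask = torch.tensor(data.train_mask)  # masks above were numpy arrays
test_mask = torch.tensor(data.test_mask)

model.train()
for epoch in range(200):
    optimizer.zero_grad()
    out = model(data)
    loss = F.nll_loss(out[train_mask], data.y[train_mask])  # train nodes only
    loss.backward()
    optimizer.step()

model.eval()
yhat = model(data).argmax(dim=1)
test_acc = (yhat[test_mask] == data.y[test_mask]).float().mean().item()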

autogluon

A. Data

tr = TabularDataset(df50_tr)
tst = TabularDataset(df50_test)

B. Create the predictor

predictr = TabularPredictor("is_fraud")
No path specified. Models will be saved in: "AutogluonModels/ag-20231101_160221/"
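The positional argument is the label column. If desired, the save path and the evaluation metric can also be fixed at construction time, e.g. (the path below is illustrative):

predictr = TabularPredictor(label="is_fraud", eval_metric="f1", path="AutogluonModels/df50")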

C. Fit

predictr.fit(tr) 
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20231101_160221/"
AutoGluon Version:  0.8.2
Python Version:     3.8.18
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #26~22.04.1-Ubuntu SMP PREEMPT_DYNAMIC Thu Jul 13 16:27:29 UTC 2
Disk Space Avail:   690.98 GB / 982.82 GB (70.3%)
Train Data Rows:    9009
Train Data Columns: 1
Label Column: is_fraud
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
    2 unique label values:  [1, 0]
    If 'binary' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Selected class <--> label mapping:  class 1 = 1, class 0 = 0
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
    Available Memory:                    112324.67 MB
    Train Data (Original)  Memory Usage: 0.07 MB (0.0% of available memory)
    Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
    Stage 1 Generators:
        Fitting AsTypeFeatureGenerator...
    Stage 2 Generators:
        Fitting FillNaFeatureGenerator...
    Stage 3 Generators:
        Fitting IdentityFeatureGenerator...
    Stage 4 Generators:
        Fitting DropUniqueFeatureGenerator...
    Stage 5 Generators:
        Fitting DropDuplicatesFeatureGenerator...
    Types of features in original data (raw dtype, special dtypes):
        ('float', []) : 1 | ['amt']
    Types of features in processed data (raw dtype, special dtypes):
        ('float', []) : 1 | ['amt']
    0.0s = Fit runtime
    1 features in original data used to generate 1 features in processed data.
    Train Data (Processed) Memory Usage: 0.07 MB (0.0% of available memory)
Data preprocessing and feature engineering runtime = 0.02s ...
AutoGluon will gauge predictive performance using evaluation metric: 'accuracy'
    To change this, specify the eval_metric parameter of Predictor()
Automatically generating train/validation split with holdout_frac=0.1, Train Rows: 8108, Val Rows: 901
User-specified model hyperparameters to be fit:
{
    'NN_TORCH': {},
    'GBM': [{'extra_trees': True, 'ag_args': {'name_suffix': 'XT'}}, {}, 'GBMLarge'],
    'CAT': {},
    'XGB': {},
    'FASTAI': {},
    'RF': [{'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'squared_error', 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression', 'quantile']}}],
    'XT': [{'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'squared_error', 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression', 'quantile']}}],
    'KNN': [{'weights': 'uniform', 'ag_args': {'name_suffix': 'Unif'}}, {'weights': 'distance', 'ag_args': {'name_suffix': 'Dist'}}],
}
Fitting 13 L1 models ...
Fitting model: KNeighborsUnif ...
    0.8779   = Validation score   (accuracy)
    0.01s    = Training   runtime
    0.01s    = Validation runtime
Fitting model: KNeighborsDist ...
    0.8635   = Validation score   (accuracy)
    0.0s     = Training   runtime
    0.0s     = Validation runtime
Fitting model: LightGBMXT ...
    0.8768   = Validation score   (accuracy)
    0.17s    = Training   runtime
    0.0s     = Validation runtime
Fitting model: LightGBM ...
    0.8923   = Validation score   (accuracy)
    0.25s    = Training   runtime
    0.0s     = Validation runtime
Fitting model: RandomForestGini ...
    0.8513   = Validation score   (accuracy)
    0.3s     = Training   runtime
    0.03s    = Validation runtime
Fitting model: RandomForestEntr ...
    0.8513   = Validation score   (accuracy)
    0.31s    = Training   runtime
    0.03s    = Validation runtime
Fitting model: CatBoost ...
    0.8946   = Validation score   (accuracy)
    0.71s    = Training   runtime
    0.0s     = Validation runtime
Fitting model: ExtraTreesGini ...
    0.8602   = Validation score   (accuracy)
    0.28s    = Training   runtime
    0.03s    = Validation runtime
Fitting model: ExtraTreesEntr ...
    0.8579   = Validation score   (accuracy)
    0.28s    = Training   runtime
    0.03s    = Validation runtime
Fitting model: NeuralNetFastAI ...
No improvement since epoch 1: early stopping
    0.8635   = Validation score   (accuracy)
    2.82s    = Training   runtime
    0.01s    = Validation runtime
Fitting model: XGBoost ...
    0.8935   = Validation score   (accuracy)
    0.1s     = Training   runtime
    0.0s     = Validation runtime
Fitting model: NeuralNetTorch ...
    0.8857   = Validation score   (accuracy)
    4.62s    = Training   runtime
    0.0s     = Validation runtime
Fitting model: LightGBMLarge ...
    0.8946   = Validation score   (accuracy)
    0.31s    = Training   runtime
    0.0s     = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
    0.9023   = Validation score   (accuracy)
    0.48s    = Training   runtime
    0.0s     = Validation runtime
AutoGluon training complete, total runtime = 10.98s ... Best model: "WeightedEnsemble_L2"
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels/ag-20231101_160221/")
<autogluon.tabular.predictor.predictor.TabularPredictor at 0x7fd21ca03b80>
predictr.leaderboard()
                  model  score_val  pred_time_val  fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0   WeightedEnsemble_L2   0.902331       0.047529  6.681587                0.001262           0.483885            2       True         14
1         LightGBMLarge   0.894562       0.001690  0.307810                0.001690           0.307810            1       True         13
2              CatBoost   0.894562       0.002187  0.705743                0.002187           0.705743            1       True          7
3               XGBoost   0.893452       0.003370  0.104729                0.003370           0.104729            1       True         11
4              LightGBM   0.892342       0.003417  0.250978                0.003417           0.250978            1       True          4
5        NeuralNetTorch   0.885683       0.004299  4.621580                0.004299           4.621580            1       True         12
6        KNeighborsUnif   0.877913       0.005800  0.008693                0.005800           0.008693            1       True          1
7            LightGBMXT   0.876804       0.002110  0.170732                0.002110           0.170732            1       True          3
8        KNeighborsDist   0.863485       0.004718  0.004118                0.004718           0.004118            1       True          2
9       NeuralNetFastAI   0.863485       0.010838  2.824286                0.010838           2.824286            1       True         10
10       ExtraTreesGini   0.860155       0.027893  0.282991                0.027893           0.282991            1       True          8
11       ExtraTreesEntr   0.857936       0.027677  0.275942                0.027677           0.275942            1       True          9
12     RandomForestEntr   0.851276       0.027698  0.310881                0.027698           0.310881            1       True          6
13     RandomForestGini   0.851276       0.028313  0.296602                0.028313           0.296602            1       True          5

D. Predict

(tr.is_fraud == predictr.predict(tr)).mean()
0.9102009102009102
(tst.is_fraud == predictr.predict(tst)).mean()
0.8904428904428905
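The same held-out comparison is also available through the predictor itself, which applies the configured eval_metric (accuracy here):

predictr.evaluate(tst)  # returns a dict of metrics computed on tst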
yyhat = predictr.predict(tr)

Not sure this is really the right way to use autogluon…;;; As a check, compute each trained model's F1 score on the training set:

from sklearn.metrics import f1_score

f1_scores = {}
y_true = tr.is_fraud

for model_name in predictr.get_model_names():        # every trained model
    y_pred = predictr.predict(tr, model=model_name)  # predict with that model
    f1_scores[model_name] = f1_score(y_true, y_pred) # compute its F1 score

# print the F1 score of each individual model
for model_name, f1 in f1_scores.items():
    print(f"Model: {model_name}, F1 Score: {f1}")
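Alternatively, leaderboard can score every trained model on a given dataset in one call via its extra_metrics argument, which avoids the manual loop entirely:

predictr.leaderboard(tr, extra_metrics=['f1'], silent=True)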